This is a two-part data engineering and analysis of the Stack Overflow Developer Survey 2020, using Python and PostgreSQL.
The first part is the data engineering part. It consists of data pre-processing, creation of a PostgreSQL database and insertion of the data into said database. This is achieved with pandas, NumPy and the psycopg2 driver. Part one is available here.
The second part is the data analysis part. It consists of extracting data from the database for analysis and visualization using Plotly, with key takeaways at the end of each section.
There were a total of 64,461 respondents and a total of 61 questions. However, fewer than 30 questions are considered here. The main purpose of this analysis is to explore who the respondents are: how they use code professionally, their country, academic background, the technologies they work with, etc.
The original dataset can be downloaded from Stack Overflow's website here.
from configparser import ConfigParser
import psycopg2
import psycopg2.extras as psql_extras
import pandas as pd
import numpy as np
import plotly.express as px
from typing import Dict, List, Tuple
import plotly.offline as pyo
pyo.init_notebook_mode()
def get_relative_frequency_by_sum(
    df: pd.DataFrame,
    str_query: str,
    frequency_column: str,
    precision: int = 2
) -> float:
    """
    Return the percentage of `frequency_column`'s grand total contributed by
    the rows matching `str_query`, rounded to `precision` decimal places.
    """
    matching_total = df.query(str_query)[frequency_column].sum()
    grand_total = df[frequency_column].sum()
    return round(matching_total * 100 / grand_total, precision)
def get_relative_frequency_by_count(
    df: pd.DataFrame,
    str_query: str,
    frequency_column: str,
    num_respondents: int,
    precision: int = 2
) -> float:
    """
    Return the percentage of `num_respondents` represented by the summed
    `frequency_column` of the rows matching `str_query`, rounded to
    `precision` decimal places.

    Unlike `get_relative_frequency_by_sum`, the denominator is an external
    respondent count rather than the column's own total.
    """
    matching_total = df.query(str_query)[frequency_column].sum()
    return round(matching_total * 100 / num_respondents, precision)
def load_conn_info(ini_filename: str) -> Dict[str, str]:
    """
    Read the `[postgresql]` section of an .ini file and return its
    key/value pairs as a dict of connection parameters.
    """
    parser = ConfigParser()
    parser.read(ini_filename)
    return dict(parser.items("postgresql"))
def connect_db(conn_info: Dict[str, str]) -> "psycopg2.extensions.connection":
    """
    Open a psycopg2 connection using the parameters in `conn_info`.

    Raises:
        Whatever psycopg2 raises on a failed connection attempt. The error
        is printed first, then re-raised. (The original version swallowed
        the error and fell through to `return conn` with `conn` unbound,
        so the real failure was masked by an UnboundLocalError.)
    """
    try:
        conn = psycopg2.connect(**conn_info)
    except Exception as error:
        print(f"{type(error).__name__}: {error}")
        raise
    print(f"Successfully connected to `{conn_info['database']}` database")
    return conn
def get_table_info(cursor, table_name: str) -> Tuple[int, List]:
    """
    Return `(num_rows, column_names)` for a table in the survey database.
    (The original docstring referred to "anime_db" — a copy-paste leftover
    from another project.)

    NOTE: the table name is interpolated into the COUNT query because SQL
    identifiers cannot be bound as parameters; only call this with trusted,
    hard-coded table names. The information_schema lookup is a value
    comparison, so it IS parameterized properly.
    """
    cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
    num_rows = cursor.fetchone()[0]
    cursor.execute(
        "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = %s;",
        (table_name,)
    )
    col_names = [result[0] for result in cursor.fetchall()]
    return (num_rows, col_names)
def get_data_from_db(
    connection: "psycopg2.extensions.connection",
    cursor: "psycopg2.extensions.cursor",
    query: str,
    df: pd.DataFrame,
    col_names: List[str]
) -> pd.DataFrame:
    """
    Execute `query` and return `df` with every fetched row appended,
    mapping `col_names` positionally onto each result tuple.

    On any error the exception and the failed query are printed, the
    transaction is rolled back, and the rows accumulated so far are
    returned (the original version implicitly returned None here, which
    broke every downstream `df[...]` access).

    Note: psycopg2 annotations are strings so this module can be imported
    without the driver installed.
    """
    try:
        cursor.execute(query)
        while True:
            # Fetch in batches of 100 to avoid materializing the whole
            # result set at once
            query_results = cursor.fetchmany(100)
            # fetchmany returns an empty list once the results are exhausted
            if not query_results:
                break
            # Map column names onto each row tuple
            results_mapped = [dict(zip(col_names, row)) for row in query_results]
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported way to extend a frame
            df = pd.concat([df, pd.DataFrame(results_mapped)], ignore_index=True)
        return df
    except Exception as error:
        print(f"{type(error).__name__}: {error}")
        print("Query:", cursor.query)
        connection.rollback()
        return df
# Read the PostgreSQL credentials from disk and open the connection and
# cursor shared by every query below.
db_params = load_conn_info("db.ini")
connection = connect_db(db_params)
cursor = connection.cursor()
# --- Gender of respondents (pie chart) ---
cols = ["gender", "gender_count"]
gender_df = pd.DataFrame(columns=cols)
sql = """
SELECT gender, COUNT(gender) AS gender_count
FROM respondents
WHERE gender IS NOT NULL
GROUP BY gender;
"""
gender_df = get_data_from_db(connection, cursor, sql, gender_df, cols)
fig = px.pie(
    gender_df,
    values="gender_count",
    names="gender",
    title="Gender of Respondents",
    color_discrete_sequence=["rgb(0,154,152)", "rgb(0,120,164)", "rgb(0,84,176)", "rgb(0,44,190)", "rgb(0,3,129)"]
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Age of respondents (bar chart), restricted to plausible ages ---
cols = ["age", "age_count"]
age_df = pd.DataFrame(columns=cols)
sql = """
SELECT age, COUNT(age) AS age_count
FROM respondents
WHERE age BETWEEN 10 AND 80
GROUP BY age;
"""
age_df = get_data_from_db(connection, cursor, sql, age_df, cols)
# Key takeaway: share of respondents aged 20-30
age_20_30 = get_relative_frequency_by_sum(age_df, "age >= 20 and age <=30", "age_count")
fig = px.bar(age_df, x="age", y="age_count", title="Age of Respondents")
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 15 countries by respondent count (horizontal bars) ---
cols = ["country", "country_count"]
country_df = pd.DataFrame(columns=cols)
sql = """
SELECT country, COUNT(country) AS country_count
FROM respondents
GROUP BY country
ORDER BY country_count DESC
LIMIT 15;
"""
country_df = get_data_from_db(connection, cursor, sql, country_df, cols)
# Ascending sort puts the largest bar at the top of the horizontal chart
country_df.sort_values(by="country_count", ascending=True, inplace=True)
fig = px.bar(
    country_df,
    x="country_count",
    y="country",
    title=f"Top {country_df.shape[0]} Countries with most Respondents",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Education level of respondents (horizontal bars) ---
col_names = ["ed_level", "ed_level_count"]
df = pd.DataFrame(columns=col_names)
query = """
SELECT ed_level, COUNT(ed_level) AS ed_level_count
FROM respondents
GROUP BY ed_level;
"""
df = get_data_from_db(connection, cursor, query, df, col_names)
df.sort_values(by="ed_level_count", ascending=True, inplace=True)
# NOTE: a stray `df["ed_level"].unique()[-1]` expression was removed here --
# it computed a value and discarded it (leftover notebook-cell output).
# Key takeaways: percentage of bachelor's and master's degree holders
# (.iloc[0] replaces the less direct .array[0]; the total is computed once)
ed_level_total = df["ed_level_count"].sum()
ed_level_bachelor = round(
    (df.loc[df["ed_level"] == "Bachelor's degree", "ed_level_count"].iloc[0] * 100) / ed_level_total, 2
)
ed_level_masters = round(
    (df.loc[df["ed_level"] == "Master's degree", "ed_level_count"].iloc[0] * 100) / ed_level_total, 2
)
fig = px.bar(
    df,
    x="ed_level_count",
    y="ed_level",
    title="Education Level of Respondents",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Hobby coding (pie chart) ---
cols = ["hobbyist", "hobbyist_count"]
hobby_df = pd.DataFrame(columns=cols)
sql = """
SELECT hobbyist, COUNT(hobbyist) AS hobbyist_count
FROM respondents
WHERE hobbyist IS NOT NULL
GROUP BY hobbyist;
"""
hobby_df = get_data_from_db(connection, cursor, sql, hobby_df, cols)
fig = px.pie(
    hobby_df,
    values="hobbyist_count",
    names="hobbyist",
    title="Do you code as a hobby?",
    color_discrete_sequence=["rgb(0,154,152)", "rgb(0,120,164)", "rgb(0,84,176)", "rgb(0,44,190)", "rgb(0,3,129)"]
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 10 programming languages (bar chart) ---
cols = ["prog_language", "prog_language_count"]
lang_df = pd.DataFrame(columns=cols)
sql = """
SELECT prog_language, COUNT(prog_language) AS prog_language_count
FROM prog_languages
GROUP BY prog_language
ORDER BY prog_language_count DESC
LIMIT 10;
"""
lang_df = get_data_from_db(connection, cursor, sql, lang_df, cols)
fig = px.bar(
    lang_df,
    x="prog_language",
    y="prog_language_count",
    title=f"{lang_df.shape[0]} Most Used Programming Languages"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 10 databases (bar chart) ---
cols = ["database", "database_count"]
db_df = pd.DataFrame(columns=cols)
sql = """
SELECT database, COUNT(database) AS database_count
FROM databases
GROUP BY database
ORDER BY database_count DESC
LIMIT 10;
"""
db_df = get_data_from_db(connection, cursor, sql, db_df, cols)
fig = px.bar(
    db_df,
    x="database",
    y="database_count",
    title=f"{db_df.shape[0]} Most Used Databases"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 10 web frameworks (bar chart) ---
cols = ["web_fw", "web_fw_count"]
fw_df = pd.DataFrame(columns=cols)
sql = """
SELECT web_fw, COUNT(web_fw) AS web_fw_count
FROM web_fws
GROUP BY web_fw
ORDER BY web_fw_count DESC
LIMIT 10;
"""
fw_df = get_data_from_db(connection, cursor, sql, fw_df, cols)
fig = px.bar(
    fw_df,
    x="web_fw",
    y="web_fw_count",
    title=f"{fw_df.shape[0]} Most Used Web Frameworks"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 10 collaboration tools (bar chart) ---
cols = ["collab_tool", "collab_tool_count"]
tool_df = pd.DataFrame(columns=cols)
sql = """
SELECT collab_tool, COUNT(collab_tool) AS collab_tool_count
FROM collab_tools
GROUP BY collab_tool
ORDER BY collab_tool_count DESC
LIMIT 10;
"""
tool_df = get_data_from_db(connection, cursor, sql, tool_df, cols)
fig = px.bar(
    tool_df,
    x="collab_tool",
    y="collab_tool_count",
    title=f"{tool_df.shape[0]} Most Used Collaboration Tools"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Top 10 platforms (bar chart) ---
cols = ["platform", "platform_count"]
platform_df = pd.DataFrame(columns=cols)
sql = """
SELECT platform, COUNT(platform) AS platform_count
FROM platforms
GROUP BY platform
ORDER BY platform_count DESC
LIMIT 10;
"""
platform_df = get_data_from_db(connection, cursor, sql, platform_df, cols)
fig = px.bar(
    platform_df,
    x="platform",
    y="platform_count",
    title=f"{platform_df.shape[0]} Most Used Platforms"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Operating systems (bar chart) ---
cols = ["op_sys", "op_sys_count"]
os_df = pd.DataFrame(columns=cols)
sql = """
SELECT op_sys, COUNT(op_sys) AS op_sys_count
FROM respondents
GROUP BY op_sys
ORDER BY op_sys_count DESC;
"""
os_df = get_data_from_db(connection, cursor, sql, os_df, cols)
# Key takeaways: share of each of the three major OS families
op_sys_windows = get_relative_frequency_by_sum(os_df, "op_sys == 'Windows'", "op_sys_count")
op_sys_linux = get_relative_frequency_by_sum(os_df, "op_sys == 'Linux-based'", "op_sys_count")
op_sys_macos = get_relative_frequency_by_sum(os_df, "op_sys == 'MacOS'", "op_sys_count")
fig = px.bar(os_df, x="op_sys", y="op_sys_count", title="Most Used Operating Systems")
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Employment status (horizontal bars) ---
cols = ["employment_status", "employment_status_count"]
emp_df = pd.DataFrame(columns=cols)
sql = """
SELECT employment_status, COUNT(employment_status) AS employment_status_count
FROM respondents
GROUP BY employment_status
ORDER BY employment_status_count DESC;
"""
emp_df = get_data_from_db(connection, cursor, sql, emp_df, cols)
emp_df.sort_values(by="employment_status_count", ascending=True, inplace=True)
# Key takeaways: share of full-time employees and of students
employment_status_fulltime = get_relative_frequency_by_sum(emp_df, "employment_status == 'Employed full-time'", "employment_status_count")
employment_status_student = get_relative_frequency_by_sum(emp_df, "employment_status == 'Student'", "employment_status_count")
fig = px.bar(
    emp_df,
    x="employment_status_count",
    y="employment_status",
    title="Employment Status of Respondents",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Developer roles (horizontal bars) ---
# The dev_roles table holds one row per (respondent, role); percentages
# below use the number of distinct respondents as the denominator.
cols = ["num_respondents"]
count_df = pd.DataFrame(columns=cols)
sql = """
SELECT COUNT(DISTINCT respondent_id)
FROM dev_roles;
"""
count_df = get_data_from_db(connection, cursor, sql, count_df, cols)
num_respondents = count_df.loc[0, "num_respondents"]
cols = ["dev_role", "dev_role_count"]
role_df = pd.DataFrame(columns=cols)
sql = """
SELECT dev_role, COUNT(dev_role) AS dev_role_count
FROM dev_roles
GROUP BY dev_role
ORDER BY dev_role_count DESC
LIMIT 10;
"""
role_df = get_data_from_db(connection, cursor, sql, role_df, cols)
role_df.sort_values(by="dev_role_count", ascending=True, inplace=True)
# Key takeaways: share of back-end, full-stack and front-end developers
dev_role_backend = get_relative_frequency_by_count(
    role_df, "dev_role == 'Back-end developer'", "dev_role_count", num_respondents)
dev_role_fullstack = get_relative_frequency_by_count(
    role_df, "dev_role == 'Full-stack developer'", "dev_role_count", num_respondents)
dev_role_frontend = get_relative_frequency_by_count(
    role_df, "dev_role == 'Front-end developer'", "dev_role_count", num_respondents)
fig = px.bar(
    role_df,
    x="dev_role_count",
    y="dev_role",
    title=f"{role_df.shape[0]} Most Common Developer Roles",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Annual compensation histogram (USD), capped at 200K ---
cols = ["yearly_compensation"]
comp_df = pd.DataFrame(columns=cols)
sql = """
SELECT yearly_compensation
FROM respondents
WHERE yearly_compensation BETWEEN 0 AND 200000;
"""
comp_df = get_data_from_db(connection, cursor, sql, comp_df, cols)
# Human-readable labels for each 15K-wide, left-closed bin
bin_labels = [f"[{int(i/1000):,}K, {int((i+15000)/1000):,}K)" for i in range(0, 200001, 15000)]
# Matching interval bins
data_bins = pd.IntervalIndex.from_tuples(
    [(i, i+15000) for i in range(0, 200001, 15000)],
    closed="left"
)
# Assign each compensation value to its bin
comp_binned = pd.cut(
    comp_df["yearly_compensation"], data_bins,
    right=True,
    precision=0,
    include_lowest=True
)
comp_binned.sort_values(ascending=True, inplace=True)
# Plotly cannot plot pandas Interval categories directly, so stringify them
comp_binned = comp_binned.astype("str")
fig = px.histogram(x=comp_binned, title="Annual Compensation (USD)")
fig.update_layout(
    xaxis={"tickmode": "array", "tickvals": comp_binned.unique(), "ticktext": bin_labels},
    xaxis_title="",
    yaxis_title="",
    title_x=0.5,
    bargap=0
)
fig.update_traces(marker={"line": {"width": 2, "color": "DarkSlateGrey"}})
fig.show()
# --- Average compensation per programming language, top 20 (horizontal bars) ---
cols = ["prog_language", "compensation_avg"]
lang_comp_df = pd.DataFrame(columns=cols)
sql = """
SELECT
prog_languages.prog_language AS prog_language,
AVG(respondents.yearly_compensation) AS compensation_avg
FROM prog_languages
INNER JOIN respondents
ON prog_languages.respondent_id = respondents.id
WHERE yearly_compensation BETWEEN 0 AND 200000
GROUP BY prog_language
ORDER BY compensation_avg DESC
LIMIT 20;
"""
lang_comp_df = get_data_from_db(connection, cursor, sql, lang_comp_df, cols)
lang_comp_df.sort_values(by="compensation_avg", ascending=True, inplace=True)
fig = px.bar(
    lang_comp_df,
    x="compensation_avg",
    y="prog_language",
    title="Average Annual Compensation by Programming Language (Top 20)",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Average compensation per developer role (horizontal bars) ---
cols = ["dev_role", "compensation_avg"]
role_comp_df = pd.DataFrame(columns=cols)
sql = """
SELECT
dev_roles.dev_role AS dev_role,
AVG(respondents.yearly_compensation) AS compensation_avg
FROM dev_roles
INNER JOIN respondents
ON dev_roles.respondent_id = respondents.id
WHERE yearly_compensation BETWEEN 0 AND 200000
GROUP BY dev_role
ORDER BY compensation_avg DESC;
"""
role_comp_df = get_data_from_db(connection, cursor, sql, role_comp_df, cols)
role_comp_df.sort_values(by="compensation_avg", ascending=True, inplace=True)
fig = px.bar(
    role_comp_df,
    x="compensation_avg",
    y="dev_role",
    title="Average Annual Compensation by Developer Role",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Weekly working hours histogram, capped at 100 ---
cols = ["work_week_hrs"]
hours_df = pd.DataFrame(columns=cols)
sql = """
SELECT work_week_hrs
FROM respondents
WHERE work_week_hrs BETWEEN 0 AND 100;
"""
hours_df = get_data_from_db(connection, cursor, sql, hours_df, cols)
# Human-readable labels for each 10-hour-wide, left-closed bin
bin_labels = [f"[{i}, {i+10})" for i in range(0, 101, 10)]
# Matching interval bins
data_bins = pd.IntervalIndex.from_tuples(
    [(i, i+10) for i in range(0, 101, 10)],
    closed="left"
)
# Assign each hour count to its bin
hours_binned = pd.cut(
    hours_df["work_week_hrs"], data_bins,
    right=True,
    precision=0,
    include_lowest=True
)
hours_binned.sort_values(ascending=True, inplace=True)
# Plotly cannot plot pandas Interval categories directly, so stringify them
hours_binned = hours_binned.astype("str")
fig = px.histogram(x=hours_binned, title="Hours of Work per Week")
fig.update_layout(
    xaxis={"tickmode": "array", "tickvals": hours_binned.unique(), "ticktext": bin_labels},
    xaxis_title="",
    yaxis_title="",
    title_x=0.5,
    bargap=0
)
fig.update_traces(marker={"line": {"width": 2, "color": "DarkSlateGrey"}})
fig.show()
# --- Overtime frequency (pie chart) ---
cols = ["overtime", "overtime_count"]
ot_df = pd.DataFrame(columns=cols)
sql = """
SELECT overtime, COUNT(overtime) AS overtime_count
FROM respondents
WHERE overtime IS NOT NULL
GROUP BY overtime;
"""
ot_df = get_data_from_db(connection, cursor, sql, ot_df, cols)
fig = px.pie(
    ot_df,
    values="overtime_count",
    names="overtime",
    title="Overtime Work Frequency",
    color_discrete_sequence=["rgb(0,154,152)", "rgb(0,120,164)", "rgb(0,84,176)", "rgb(0,44,190)", "rgb(0,3,129)"]
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Undergrad major (horizontal bars) ---
# NOTE: "undegrad" spelling matches the database column name; keep it.
cols = ["undegrad_major", "undegrad_major_count"]
major_df = pd.DataFrame(columns=cols)
sql = """
SELECT undegrad_major, COUNT(undegrad_major) AS undegrad_major_count
FROM respondents
GROUP BY undegrad_major
ORDER BY undegrad_major_count DESC;
"""
major_df = get_data_from_db(connection, cursor, sql, major_df, cols)
major_df.sort_values(by="undegrad_major_count", ascending=True, inplace=True)
fig = px.bar(
    major_df,
    x="undegrad_major_count",
    y="undegrad_major",
    title="Undergrad Major of Respondents",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- What respondents do when stuck (bar chart) ---
cols = ["stuck_solution", "stuck_solution_count"]
stuck_df = pd.DataFrame(columns=cols)
sql = """
SELECT stuck_solution, COUNT(stuck_solution) AS stuck_solution_count
FROM stuck_solutions
GROUP BY stuck_solution
ORDER BY stuck_solution_count DESC;
"""
stuck_df = get_data_from_db(connection, cursor, sql, stuck_df, cols)
fig = px.bar(
    stuck_df,
    x="stuck_solution",
    y="stuck_solution_count",
    title="Solutions Used When Stuck"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# --- Most important job factors (horizontal bars) ---
# job_factors holds one row per (respondent, factor); percentages below
# use the number of distinct respondents as the denominator.
cols = ["num_respondents"]
count_df = pd.DataFrame(columns=cols)
sql = """
SELECT COUNT(DISTINCT respondent_id)
FROM dev_roles;
"""
count_df = get_data_from_db(connection, cursor, sql, count_df, cols)
num_respondents = count_df.loc[0, "num_respondents"]
cols = ["job_factor", "job_factor_count"]
factor_df = pd.DataFrame(columns=cols)
sql = """
SELECT job_factor, COUNT(job_factor) AS job_factor_count
FROM job_factors
GROUP BY job_factor
ORDER BY job_factor_count DESC
LIMIT 10;
"""
factor_df = get_data_from_db(connection, cursor, sql, factor_df, cols)
factor_df.sort_values(by="job_factor_count", ascending=True, inplace=True)
# Key takeaways: share of respondents citing each of these three factors
job_factor_tech = get_relative_frequency_by_count(
    factor_df, "job_factor == 'Technical compatibility'", "job_factor_count", num_respondents)
job_factor_culture = get_relative_frequency_by_count(
    factor_df, "job_factor == 'Company culture'", "job_factor_count", num_respondents)
job_factor_schedule = get_relative_frequency_by_count(
    factor_df, "job_factor == 'Flexible schedule'", "job_factor_count", num_respondents)
fig = px.bar(
    factor_df,
    x="job_factor_count",
    y="job_factor",
    title=f"{factor_df.shape[0]} Most Important Job Factors",
    orientation="h"
)
fig.update_layout(xaxis_title="", yaxis_title="", title_x=0.5)
fig.show()
# Release database resources: close the cursor BEFORE its parent connection.
# The original closed the connection first, leaving the cursor to be closed
# against an already-closed connection.
cursor.close()
connection.close()